home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power Programmierung
/
Power-Programmierung CD 2 (Tewi)(1994).iso
/
doc
/
mir
/
a_occur.c
< prev
next >
Wrap
Text File
|
1992-07-02
|
8KB
|
240 lines
/*
* usage - a_occur [ min_freq ] [ /n ] < ascii_text > report
* /n = non-sequenced data is okay
*
* A_OCCUR Count the frequency of occurrence of identical lines
* If a minimum frequency is specified, lines occurring
* fewer times are dropped entirely from the result.
*
* Input: ASCII text, which must be in sorted order UNLESS the
* flag "/n" is included.
*
* Output: A reduced copy of the file with each line shown only
* once. Each line begins with a frequency count, padded
* out to six characters with blanks.
*
* Writeup: MIR TUTORIAL ONE, topic five.
* See also the related programs A_OCCUR2 and A_OCCUR3.
*
* Written: Douglas Lowry Mar 04 87
* Modified: Douglas Lowry Apr 30 92 Reworked entirely
* Copyright (C) 1992 Innotech Inc.
*
* The MIR (Mass Indexing and Retrieval) Tutorials explain detailed
* usage and co-ordination of the MIR family of programs to analyze,
* prepare and index databases (small through gigabyte size), and
* how to build integrated retrieval software around the MIR search
* engine. The fifth of the five MIR tutorial series explains how
* to extend indexing capability into leading edge search-related
* technologies. For more information, GO IBMPRO on CompuServe;
* MIR files are in the DBMS library. The same files are on the
* Canada Remote Systems BBS. A diskette copy of the Introduction
* is available by mail ($10 US... check, Visa or Mastercard);
* diskettes with Introduction, Tutorial ONE software and the
* shareware Tutorial ONE text cost $29. Shareware registration
* for a tutorial is also $29.
*
* E-mail...
* Compuserve 71431,1337
* Internet doug.lowry%canrem.com
* UUCP canrem!doug.lowry
* Others: doug.lowry@canrem.uucp
*
* FAX... 416 963-5677
*
* "Snail mail"... Douglas Lowry, Ph.D.
* Marpex Inc.
* 5334 Yonge Street, #1102
* North York, Ontario
* Canada M2N 6M2
*
* Related database consultation and preparation services are
* available through:
* Innotech Inc., 2001 Sheppard Avenue E., Suite #118,
* North York, Ontario Canada M2J 4Z7
* Tel. 416 492-3838 FAX 416 492-3843
*
* This program is free software; you may redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* (file 05LICENS) along with this program; if not, write to the
* Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*/
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* size in bytes of each line buffer used when reading the input */
#define MAX_BYTES 512
/* spelling for an endless loop */
#define repeat for(;;)

/*
 * declarations
 */

/* two-valued truth type; FALSE is 0 and TRUE is 1 */
typedef enum _bool
{
    FALSE,
    TRUE
} Bool;

/* defined further down in this file */
void Usage_();
void process();
/*
 * Cmdname_
 *    Returns the program name that is substituted into the usage text.
 */
char *
Cmdname_()
{
    return "a_occur";
}
/*
 * MAIN
 *
 *    Parses the command line and hands the settings to process().
 *
 *    At most two arguments are accepted, in any order:
 *      - a positive decimal integer: the minimum frequency a line must
 *        reach to be reported (default 1);
 *      - "/n" or "-n" (either case): the input need not be sorted.
 *    Anything else prints the usage text via Usage_(), which exits
 *    with status 1.
 *
 *    The number must be a complete, in-range, positive integer.  Zero,
 *    negative values, overflow and trailing junk are all refused, so
 *    process() is never given a threshold below 1.
 */
int
main( int argc, char **argv )
{
    Bool    must_be_seq = TRUE;     /* must be sequential ASCII order */
    int     min_freq = 1;           /* threshold frequency to show a line */
    int     i;
    long    val;
    char    *end;

    if( argc > 3 )
        Usage_();

    for( i = 1 ; i < argc ; i++ )
    {
        errno = 0;
        val = strtol( argv[i], &end, 10 );

        if( end != argv[i] && *end == '\0' && errno != ERANGE &&
            val >= 1 && val <= INT_MAX )
            min_freq = (int) val;
        else if(( argv[i][0] == '-' || argv[i][0] == '/' ) &&
                ( argv[i][1] == 'n' || argv[i][1] == 'N' ))
            must_be_seq = FALSE;
        else
            Usage_();
    }

    process( min_freq, must_be_seq ) ;
    return 0;
}
/*
 * Usage_
 *
 *    Writes the help text to stderr in three pieces and then ends the
 *    program with exit status 1.  Control never returns to the caller.
 */
void
Usage_()
{
    /* only the first piece is formatted: it carries the command name */
    fprintf( stderr,
        "\nusage: %s [ min_freq ] [ /n ] < ascii_text > report\n"
        "/n = non-sequenced data is okay\n\n"
        "Count the frequency of occurrence of identical lines\n"
        "If a minimum frequency is specified, lines occurring\n",
        Cmdname_() );

    fputs(
        " fewer times are dropped entirely from the result.\n\n"
        "Input: ASCII text, which must be in sorted order UNLESS the\n"
        "flag \"/n\" is included.\n\n"
        "Output: A reduced copy of the file with each line shown only\n"
        "once. Each line begins with a frequency count, padded\n",
        stderr );

    fputs(
        " out to six characters with blanks.\n\n"
        "Writeup: MIR TUTORIAL ONE, topic five.\n"
        "See also the related programs A_OCCUR2 and A_OCCUR3.\n\n",
        stderr );

    exit( 1 ) ;
}
/*
 * trimmed_length
 *    Length of `line` once its trailing white space (including the
 *    line terminator, if any) is ignored.  Never negative.
 */
static int
trimmed_length( const char *line )
{
    int n = (int) strlen( line );

    /* <ctype.h> functions require a value representable as unsigned char */
    while( n > 0 && isspace( (unsigned char) line[n - 1] ))
        n--;
    return( n );
}

/*
 * emit_group
 *    Prints one result line: the count left-justified in six columns,
 *    immediately followed by the text.  Groups that occur fewer than
 *    `min_freq` times are skipped.  Exits with status 1 when the
 *    write fails.
 */
static void
emit_group( long freq, const char *line, int min_freq )
{
    if( freq < min_freq )
        return;

    /* printf reports failure with a negative value, never with zero */
    if( printf( "%-6ld%s\n", freq, line ) < 0 )
    {
        fprintf( stderr, "FATAL... unable to write.\n\n" ) ;
        exit( 1 ) ;
    }
}

/*
 * PROCESS
 *
 *    Reads stdin line by line, strips trailing white space, and counts
 *    runs of identical consecutive lines.  Each run is written to
 *    stdout once, prefixed by its count, provided the count is at
 *    least `min_freq`.  Blank lines form a run like any other text.
 *
 *    min_freq      smallest count that is still reported
 *    must_be_seq   when TRUE, the first byte at which a line differs
 *                  from the previous one must not be lower than in the
 *                  previous line; a violation is reported on stderr
 *                  and the program ends through Usage_().  Bytes are
 *                  compared as plain char.  A line that is merely a
 *                  prefix of its predecessor is not reported, because
 *                  stripped trailing white space makes that case
 *                  ambiguous.
 *
 *    Exits with status 1 when a line does not fit the buffer or when
 *    output cannot be written.
 */
void
process( min_freq, must_be_seq )
    int     min_freq ;
    Bool    must_be_seq ;       /* must be sequential order (default) */
{
    char    buf[2][MAX_BYTES];  /* alternating line inputs */
    int     len[2];             /* trimmed length of each buffer */
    int     cur, prev;          /* buffer being read / start of run */
    int     raw_len;            /* length exactly as fgets delivered it */
    int     common;             /* shorter of the two trimmed lengths */
    int     lines_in;           /* count of lines read so far */
    int     i;
    long int freq;              /* occurrences in the current run */
    Bool    have_prev;          /* a run has been started */

    buf[0][0] = buf[1][0] = '\0';
    len[0] = len[1] = 0;
    cur = 0;
    prev = 1;
    lines_in = 0;
    freq = 0;
    have_prev = FALSE;

    while( fgets( buf[cur], MAX_BYTES, stdin ) != NULL )
    {
        lines_in++ ;
        raw_len = (int) strlen( buf[cur] );
        len[cur] = trimmed_length( buf[cur] );

        /*
         * A full buffer that does not end in a newline means fgets
         * stopped in the middle of a line.
         */
        if( len[cur] > MAX_BYTES - 3 ||
            ( raw_len == MAX_BYTES - 1 && buf[cur][raw_len - 1] != '\n' ))
        {
            fprintf( stderr, "FATAL... Line length exceeds %d bytes.\n\n",
                MAX_BYTES ) ;
            exit( 1 ) ;
        }
        buf[cur][len[cur]] = '\0' ;

        if( have_prev )
        {
            /* locate the first byte at which the two lines differ */
            common = ( len[cur] < len[prev] ) ? len[cur] : len[prev];
            for( i = 0; i < common && buf[cur][i] == buf[prev][i]; i++ )
                ;

            if( i == len[cur] && i == len[prev] )
            {
                /* same text: buf[prev] keeps the first copy of the run */
                freq++;
                continue;
            }

            if( must_be_seq && i < common && buf[cur][i] < buf[prev][i] )
            {
                /* text is shown in the same order as the line numbers */
                fprintf( stderr,
                    "Not sorted... lines %d and %d\n%s\n%s\n",
                    lines_in - 1, lines_in, buf[prev], buf[cur] );
                Usage_();
            }

            emit_group( freq, buf[prev], min_freq );
        }

        /* the line just read opens a new run */
        have_prev = TRUE;
        freq = 1;
        prev = cur;
        cur = 1 - prev;
    }

    /* end of input: the last run has not been written yet */
    if( have_prev )
        emit_group( freq, buf[prev], min_freq );

    /* buffered output may only fail once it is actually flushed */
    if( fflush( stdout ) != 0 )
    {
        fprintf( stderr, "FATAL... unable to write.\n\n" ) ;
        exit( 1 ) ;
    }
    return;
}